{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# numba 的基本用法\n",
    "\n",
    "## 使用 jit 加速 Python 低效的 for 语句"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12.6 ms ± 935 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "6.33 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "4.12 ms ± 850 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "import numba as nb\n",
    "import numpy as np\n",
    "\n",
    "# 普通的 for\n",
    "def add1(x, c):\n",
    "    rs = [0.] * len(x)\n",
    "    for i, xx in enumerate(x):\n",
    "        rs[i] = xx + c\n",
    "    return rs\n",
    "\n",
    "# list comprehension\n",
    "def add2(x, c):\n",
    "    return [xx + c for xx in x]\n",
    "\n",
    "# 使用 jit 加速后的 for\n",
    "@nb.jit(nopython=True)\n",
    "def add_with_jit(x, c):\n",
    "    rs = [0.] * len(x)\n",
    "    for i, xx in enumerate(x):\n",
    "        rs[i] = xx + c\n",
    "    return rs\n",
    "\n",
    "# jit 的错误使用姿势\n",
    "@nb.jit(nopython=True)\n",
    "def wrong_add(x, c):\n",
    "    rs = [0] * len(x)\n",
    "    for i, xx in enumerate(x):\n",
    "        rs[i] = xx + c\n",
    "    return rs\n",
    "\n",
    "y = np.random.random(10**5).astype(np.float32)\n",
    "x = y.tolist()\n",
    "\n",
    "assert np.allclose(add1(x, 1), add2(x, 1), add_with_jit(x, 1))\n",
    "%timeit add1(x, 1)\n",
    "%timeit add2(x, 1)\n",
    "%timeit add_with_jit(x, 1)\n",
    "print(np.allclose(wrong_add(x, 1), 1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ 注意：\n",
    "    + `numba`不支持 list comprehension，详情可参见[这里](https://github.com/numba/numba/issues/504)\n",
    "    + `jit`会在某种程度上“预编译”你的代码，这意味着它会在某种程度上固定住各个变量的数据类型；所以在`jit`下定义数组时，如果想要使用的是`float`数组的话，就不能像上述`wrong_add`里那样用`[0] * len(x)`定义、而应该在`0`后面加一个小数点：`[0.] * len(x)`\n",
    "    + `jit`能够加速的不限于`for`，但一般而言加速`for`会比较常见、效果也比较显著。我在我实现的`numpy`版本的卷积神经网络（`CNN`）中用了`jit`后、可以把代码加速 **60 多倍**。具体代码可以参见[这里](https://github.com/carefree0910/MachineLearning/blob/master/NN/Basic/Layers.py#L9)，不过如果不想看源代码的话，可以参见[CNN(zh-cn).ipynb][1]，我在其中做了一些相应的、比较简单的实验\n",
    "\n",
    "[1]: https://github.com/carefree0910/MachineLearning/blob/master/Notebooks/numba/CNN(zh-cn).ipynb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用 vectorize 实现 numpy 的 Ufunc 功能\n",
    "\n",
    "### vectorize 的基本应用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.2 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "24.1 µs ± 623 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "assert np.allclose(y + 1, add_with_jit(x, 1))\n",
    "%timeit add_with_jit(x, 1)\n",
    "%timeit y + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "67.7 µs ± 626 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
      "63.8 µs ± 5.65 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
      "26.7 µs ± 3.55 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
      "25.3 µs ± 1.27 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "@nb.vectorize(nopython=True)\n",
    "def add_with_vec(yy, c):\n",
    "    return yy + c\n",
    "\n",
    "assert np.allclose(y + 1, add_with_vec(y, 1), add_with_vec(y, 1.))\n",
    "%timeit add_with_vec(y, 1)\n",
    "%timeit add_with_vec(y, 1.)\n",
    "%timeit y + 1\n",
    "%timeit y + 1."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### vectorize 的“并行”版本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "83.3 µs ± 1.82 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
      "28.8 µs ± 1.92 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "@nb.vectorize(\"float32(float32, float32)\", target=\"parallel\", nopython=True)\n",
    "def add_with_vec(y, c):\n",
    "    return y + c\n",
    "\n",
    "assert np.allclose(y+1, add_with_vec(y,1.))\n",
    "%timeit add_with_vec(y, 1.)\n",
    "%timeit y + 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ 注意：虽然这里使用`parallel`后更慢了，但如果使用 **Intel Distribution for Python** 的话，会发现`parallel`版本甚至会比`numpy`原生的版本要稍快一些"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "107 µs ± 3.25 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
      "130 µs ± 17.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
      "450 µs ± 8.02 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "@nb.vectorize(\"float32(float32, float32, float32)\", target=\"parallel\", nopython=True)\n",
    "def clip_with_parallel(y, a, b):\n",
    "    if y < a:\n",
    "        return a\n",
    "    if y > b:\n",
    "        return b\n",
    "    return y\n",
    "\n",
    "@nb.vectorize(\"float32(float32, float32, float32)\", nopython=True)\n",
    "def clip(y, a, b):\n",
    "    if y < a:\n",
    "        return a\n",
    "    if y > b:\n",
    "        return b\n",
    "    return y\n",
    "\n",
    "assert np.allclose(np.clip(y, 0.1, 0.9), clip(y, 0.1, 0.9), clip_with_parallel(y, 0.1, 0.9))\n",
    "%timeit clip_with_parallel(y, 0.1, 0.9)\n",
    "%timeit clip(y, 0.1, 0.9)\n",
    "%timeit np.clip(y, 0.1, 0.9)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ 注意：这个栗子中的性能提升就是实打实的了"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "总之，使用`parallel`时不能一概而论，还是要做些实验。\n",
    "\n",
    "需要指出的是，`vectorize`中的参数`target`一共有三种取值：`cpu`（默认）、`parallel`和`cuda`。关于选择哪个取值，官方文档上有很好的说明：\n",
    "```text\n",
    "A general guideline is to choose different targets for different data sizes and algorithms. The “cpu” target works well for small data sizes (approx. less than 1KB) and low compute intensity algorithms. It has the least amount of overhead. The “parallel” target works well for medium data sizes (approx. less than 1MB). Threading adds a small delay. The “cuda” target works well for big data sizes (approx. greater than 1MB) and high compute intensity algorithms. Transfering memory to and from the GPU adds significant overhead.\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用 jit(nogil=True) 实现高效并发（多线程）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14.9 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "8.7 ms ± 794 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "10.2 ms ± 434 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "7.62 ms ± 37 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "3.86 ms ± 60.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "\n",
    "def np_func(a, b):\n",
    "    return 1 / (a + np.exp(-b))\n",
    "\n",
    "@nb.jit(nopython=True, nogil=False)\n",
    "def kernel1(result, a, b):\n",
    "    for i in range(len(result)):\n",
    "        result[i] = 1 / (a[i] + math.exp(-b[i]))\n",
    "                \n",
    "@nb.jit(nopython=True, nogil=True)\n",
    "def kernel2(result, a, b):\n",
    "    for i in range(len(result)):\n",
    "        result[i] = 1 / (a[i] + math.exp(-b[i]))\n",
    "\n",
    "def make_single_task(kernel):\n",
    "    def func(length, *args):\n",
    "        result = np.empty(length, dtype=np.float32)\n",
    "        kernel(result, *args)\n",
    "        return result\n",
    "    return func\n",
    "\n",
    "def make_multi_task(kernel, n_thread):\n",
    "    def func(length, *args):\n",
    "        result = np.empty(length, dtype=np.float32)\n",
    "        args = (result,) + args\n",
    "        chunk_size = (length + n_thread - 1) // n_thread\n",
    "        chunks = [[arg[i*chunk_size:(i+1)*chunk_size] for i in range(n_thread)] for arg in args]\n",
    "        with ThreadPoolExecutor(max_workers=n_thread) as e:\n",
    "            for _ in e.map(kernel, *chunks):\n",
    "                pass\n",
    "        return result\n",
    "    return func\n",
    "\n",
    "length = 10 ** 6\n",
    "a = np.random.rand(length).astype(np.float32)\n",
    "b = np.random.rand(length).astype(np.float32)\n",
    "\n",
    "nb_func1 = make_single_task(kernel1)\n",
    "nb_func2 = make_multi_task(kernel1, 4)\n",
    "nb_func3 = make_single_task(kernel2)\n",
    "nb_func4 = make_multi_task(kernel2, 4)\n",
    "\n",
    "rs_np = np_func(a, b)\n",
    "rs_nb1 = nb_func1(length, a, b)\n",
    "rs_nb2 = nb_func2(length, a, b)\n",
    "rs_nb3 = nb_func3(length, a, b)\n",
    "rs_nb4 = nb_func4(length, a, b)\n",
    "assert np.allclose(rs_np, rs_nb1, rs_nb2, rs_nb3, rs_nb4)\n",
    "%timeit np_func(a, b)\n",
    "%timeit nb_func1(length, a, b)\n",
    "%timeit nb_func2(length, a, b)\n",
    "%timeit nb_func3(length, a, b)\n",
    "%timeit nb_func4(length, a, b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ 注意：一般来说，数据量越大、并发的效果越明显。反之，数据量小的时候，并发很有可能会降低性能"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:playground]",
   "language": "python",
   "name": "conda-env-playground-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
